#--------------------------------------------------------------------------------#

# Online Appendix ####

#--------------------------------------------------------------------------------#

# Authors: WANG, Howard; CHENG, Edmund; CHEN, Xi; LIANG, Hai
# Version: 2024-Oct-30
# R version 4.4.1 (2024-06-14)
# Running under: macOS 15.1


rm(list = ls())

#Set directory#
(code_file_path = rstudioapi::getActiveDocumentContext()$path) # Get current path of this code file
setwd(dirname(code_file_path)) # dirname(): reach the upper level folder
getwd() # current directory


# install.packages("tidyverse")
library(tidyverse) # sometimes we need to re-install the tidyverse package, or we cannot use "%>% select"
# library(dplyr) 
library(stringr)
library(fixest)
library(lme4) # for using glmer
library(sjPlot) # for tab_model
library(sjmisc)
library(sjstats)
library(jiebaR)
# Solve the problem of conflict packages
library(conflicted)
conflict_prefer("select", "dplyr") # To ensure that when using select, it refers to dplyr


# Prepare to show Chinese characters
if (!require("showtext")) {install.packages("showtext")}
library(showtext)
showtext_auto(enable = T)



# Load data #
load(file="data/llmb_multilevel_data_final.Rdata")
d=llmb_multilevel_data_final




#--------------------------------------------------------------------------------#
# Online Appendix A: Background* -------------------------------------------------------
#--------------------------------------------------------------------------------#

# In appendix A No tables and figures are generated by codes. 







#_____________####
#--------------------------------------------------------------------------------#
# Online Appendix B: Descriptive analyses* -------------------------------------------------------
#--------------------------------------------------------------------------------#

## Figure B1 ####
dim(d)
table(is.na(d$typeName)) # no missing value
table(d$typeName)
# 咨询  建言  感谢  投诉  求助 
# 40245 22965  2181 87401 80212 

df_type = table(d$typeName) |> as.data.frame()
df_type$Types_eng = c("Consultation", "Suggestion", "Thank", "Complaint", "Help-seeking")
df_type = df_type |> rename(Types=Var1)
percentage <- scales::percent(df_type$Freq / sum(df_type$Freq)) 
percentage
labs <- paste(df_type$Types,df_type$Types_eng, '(', percentage, ')', sep = '')
labs

df_type %>%
  ggplot(aes(x = '', y = Freq, fill = Types)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_text(aes(y = cumsum(rev(Freq))-round(rev(Freq)*2/3), label = labs[seq(length(labs), 1, -1)]),size = 2) +
  theme_bw() +
  labs(x = '', y = '',title = 'Distribution of Post Types') +
  coord_polar(theta = 'y', start = 0, direction = 1)
ggsave("output/appendix/Figure B1 Claim type distribution.pdf", height = 5, width = 7.5)




## Figure B2 ####
df_topic = table(d$topic) |> as.data.frame()
colnames(df_topic) <- c('Topics', 'Freq')
percentage <- scales::percent(df_topic$Freq / sum(df_topic$Freq)) 
percentage
# [1] "4.368%"  "4.766%"  "3.978%"  "49.807%" "7.318%"  "7.602%"  "0.551%"  "4.702%"  "0.612%"  "2.892%"  "3.139%" 
# [12] "1.157%"  "9.108%" 
labs <- paste(df_topic$Topics, '(', percentage, ')', sep = '')
labs
# [1] "Administration(4.368%)" "Agriculture(4.766%)"    "Company(3.978%)"        "Construction(49.807%)" 
# [5] "Education(7.318%)"      "Employment(7.602%)"     "Entertainment(0.551%)"  "Environment(4.702%)"   
# [9] "Finance(0.612%)"        "Medicine(2.892%)"       "Security(3.139%)"       "Tourism(1.157%)"       
# [13] "Transportation(9.108%)"
library(ggplot2)
library(magrittr)
df_topic %>% 
  ggplot(aes(x = '', y = Freq, fill = Topics)) + 
  geom_bar(stat = 'identity', width = 1) + 
  geom_text(aes(y = cumsum(rev(Freq))-round(rev(Freq)*2/3), label = labs[seq(length(labs), 1, -1)]), size = 2) + 
  theme_bw() + 
  labs(x = '', y = '',title = 'Distribution of Topic Domains') + 
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )
ggsave("output/appendix/Figure B2 Topics distribution.pdf", height = 5, width = 7.5)





## Figure B3 ####
check=d|>select(replied, referral,direct_info,concrete,action,resolved)



# Summarize the data
d_daily <- d %>% 
  dplyr::group_by(reply_ymd) %>% 
  dplyr::summarise(
    detailed = mean(direct_info, na.rm = TRUE) * 100,
    action = mean(action, na.rm = TRUE) * 100,
    concrete = mean(concrete, na.rm = TRUE) * 100,
    resolved = mean(resolved, na.rm = TRUE) * 100 
  )

# Convert to long format
d_daily_long <- d_daily %>%
  pivot_longer(cols = !reply_ymd, names_to = "type", values_to = "count")

# Ensure type is a factor with the desired order of levels
d_daily_long$type <- factor(d_daily_long$type, 
                            levels = c("concrete", "detailed", "action", "resolved"), 
                            labels = c("Concrete reply", 
                                       "Direct information", 
                                       "Investigation or resolution", 
                                       "Problem resolved"))

pdf(file = "output/appendix/Figure B3. DV Distribution.pdf", width = 10.5, height = 6)
ggplot(d_daily_long, aes(x = reply_ymd, y = count, colour = type, shape = type)) + 
  geom_line(size = 0.3) + 
  geom_point(size = 2) + 
  xlab("Day\nFrom 1 Jan. 2020 to 31 Dec. 2021") +
  ylab("Quality of government response (ratio by day)") +
  scale_x_date(date_labels = "%b%Y", date_breaks = "2 month") +
  scale_color_manual(name = "Type of Responsiveness", 
                     values = c("Concrete reply" = "#07234d", 
                                "Direct information" = "#2a6863", 
                                "Investigation or resolution" = "#efcd6d", 
                                "Problem resolved" = "#d7bdc6")) +
  scale_shape_manual(name = "Type of Responsiveness", 
                     values = c("Concrete reply" = 16,  # Filled circle
                                "Direct information" = 17,  # Filled triangle
                                "Investigation or resolution" = 18,  # Filled diamond
                                "Problem resolved" = 19)) +  # Filled circle
  theme_minimal() +
  theme(legend.title = element_text(size = 16), 
        legend.text = element_text(size = 12),
        axis.title.x = element_text(size = 14),  # Increase xlab title font size
        axis.title.y = element_text(size = 14)) +  # Increase ylab title font size
  labs(caption = "Data sources: Local Leader Message Board (LLMB)")
dev.off()



## Table B1 ####
desc = d
## divide the data into two subsets
desc1 = desc[desc$complaint == 1, ] 
nrow(desc1)
desc2 = desc[desc$complaint == 0, ] 
nrow(desc2) 

### column 1 ####
dt= c(
  # speed variation
  mean(desc$replied_days, na.rm = T), # the average reply speed is 34.60562 days among all the 331 cities between Jan 2020 to Dec 2021 
  # length
  mean(desc$reply_length, na.rm = T),
  # concreteness
  table(desc$concrete)[2]/nrow(desc), # 41.76304% of claim-makings received concrete responsiveness.
  # reply_agency_n
  mean(desc$reply_agency_n, na.rm = T),
  # reply_regular
  table(desc$reply_regular)[2]/nrow(desc),
  # The number of input threads
  nrow(desc),
  # The number of replied threads
  nrow(desc[desc$replied==1,])
)


### column 2 (contentious) ####
dt1= c(
  # speed variation
  mean(desc1$replied_days, na.rm = T), # the average reply speed is 34.60562 days among all the 331 cities between Jan 2020 to Dec 2021 
  # length
  mean(desc1$reply_length, na.rm = T),
  # concreteness
  table(desc1$concrete)[2]/nrow(desc1), # 41.76304% of claim-makings received concrete responsiveness.
  # reply_agency_n
  mean(desc1$reply_agency_n, na.rm = T),
  # reply_regular
  table(desc1$reply_regular)[2]/nrow(desc1),
  # The number of input threads
  nrow(desc1),
  # The number of replied threads
  nrow(desc1[desc1$replied==1,])
)

### column 3 (non-contentious) ####
dt2= c(
  # speed variation
  mean(desc2$replied_days, na.rm = T), 
  # length
  mean(desc2$reply_length, na.rm = T),
  # concreteness
  table(desc2$concrete)[2]/nrow(desc2), 
  # reply_agency_n
  mean(desc2$reply_agency_n, na.rm = T),
  # reply_regular
  table(desc2$reply_regular)[2]/nrow(desc2),
  # The number of input threads
  nrow(desc2),
  # The number of replied threads
  nrow(desc2[desc2$replied==1,])
)

table= data.frame(sample_full=sprintf("%0.2f", dt), sample_complaint=sprintf("%0.2f", dt1), sample_noncomplaint=sprintf("%0.2f", dt2))

rownames(table)= c("replied_days_ave", "reply_length_ave", "concrete_ratio", "reply_agency_n_ave", "reply_regular_ratio", "input_n", "replied_n")
write.csv(table, file="output/appendix/Table B1.csv")










#_____________####
#--------------------------------------------------------------------------------#
# Online appendix C: Main results* -------------------------------------------------------
#--------------------------------------------------------------------------------#



#_____________####
#--------------------------------------------------------------------------------#
# Online appendix D part 1: Robustness * -------------------------------------------------------
#--------------------------------------------------------------------------------#

## Testing measurement bias ####################################

### Table D1: Broad concreteness ####
# In the main text, we define a reply as concrete when the reply is referral information or direct information or spot action. However, now we will take a strict measurement in which concreteness means direct information or spot action (with ignoring "referral"). 

summary(concrete_broad1 <- glmer(concrete_broad~complaint+
                                   (1|cityid)+(0+complaint|cityid), # City cluster
                                 d,
                                 nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial) )


summary(concrete_broad2 <- glmer(concrete_broad~complaint+
                                   post_length_log+phone+topic + # Thread controls
                                   
                                   (1|cityid)+(0+complaint|cityid), # City cluster
                                 d,
                                 nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(concrete_broad3 <- glmer(concrete_broad~complaint+
                                   post_length_log+phone+topic + # Thread controls
                                   male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                   
                                   (1|cityid)+(0+complaint|cityid), # City cluster
                                 d,
                                 nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(concrete_broad4 <- glmer(concrete_broad~complaint+
                                   post_length_log+phone+topic + # Thread controls
                                   male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                   gdp_pc_log+pop_log+revenue_log+  # City econ control
                                   
                                   (1|cityid)+(0+complaint|cityid), # City cluster
                                 d,
                                 nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

# main model
summary(concrete_broad5 <- glmer(concrete_broad~complaint+
                                   post_length_log+phone+topic + # Thread controls
                                   male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                   gdp_pc_log+pop_log+revenue_log+  # City econ control
                                   replied_days+reply_length_log+   # Reply controls
                                   (1|cityid)+(0+complaint|cityid), # City cluster
                                 d,
                                 nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))



## Save concreteness regression results
tab_model(concrete_broad1,concrete_broad2,concrete_broad3,concrete_broad4,concrete_broad5,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D1 Concreteness_broad.html") 




### Table D2: Action as DV ####
# In the main text, we define a reply as concrete when the reply is referral information or direct information or spot action. However, now we will take a strict measurement in which concreteness means direct information or spot action (with ignoring "referral"). 

summary(action1 <- glmer(action~complaint+
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d,
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial) )


summary(action2 <- glmer(action~complaint+
                           post_length_log+phone+topic + # Thread controls
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d,
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(action3 <- glmer(action~complaint+
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d,
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(action4 <- glmer(action~complaint+
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d,
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(action5 <- glmer(action~complaint+
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           replied_days+reply_length_log+   # Reply controls
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d,
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

## Save concreteness regression results
tab_model(action1,action2,action3,action4,action5,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D2 Action as DV.html") 





### Table D3: Resolved as DV ####
# In the main text, we define a reply as concrete when the reply is referral information or direct information or spot action. 
# Now we will take a strict measurement in which concreteness means direct information or spot action (with ignoring "referral"). 

summary(resolved1 <- glmer(resolved~complaint+
                             (1|cityid)+(0+complaint|cityid), # City cluster
                           d,
                           nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial) )


summary(resolved2 <- glmer(resolved~complaint+
                             post_length_log+phone+topic + # Thread controls
                             
                             (1|cityid)+(0+complaint|cityid), # City cluster
                           d,
                           nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(resolved3 <- glmer(resolved~complaint+
                             post_length_log+phone+topic + # Thread controls
                             male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                             
                             (1|cityid)+(0+complaint|cityid), # City cluster
                           d,
                           nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(resolved4 <- glmer(resolved~complaint+
                             post_length_log+phone+topic + # Thread controls
                             male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                             gdp_pc_log+pop_log+revenue_log+  # City econ control
                             
                             (1|cityid)+(0+complaint|cityid), # City cluster
                           d,
                           nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(resolved5 <- glmer(resolved~complaint+
                             post_length_log+phone+topic + # Thread controls
                             male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                             gdp_pc_log+pop_log+revenue_log+  # City econ control
                             replied_days+reply_length_log+   # Reply controls
                             (1|cityid)+(0+complaint|cityid), # City cluster
                           d,
                           nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

tab_model(resolved1,resolved2,resolved3,resolved4,resolved5,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D3 Resolved as DV.html") 




### Table D4: Reply length as DV ####
## Reply length as the dv and glmer with poisson family model
summary(reply_length_poisson1 <- glmer(reply_length~complaint+
                                         (1|cityid)+(0+complaint|cityid), # City cluster
                                       d,
                                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=poisson))


summary(reply_length_poisson2 <- glmer(reply_length~complaint+
                                         post_length_log+phone+topic + # Thread controls
                                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                         gdp_pc_log+pop_log+revenue_log+  # City econ control
                                         replied_days+ # Reply controls
                                         
                                         (1|cityid)+(0+complaint|cityid), # City cluster
                                       d,
                                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=poisson))


# ## Reply length as dv using glmer.nb model
summary(reply_length_nb1 <- glmer.nb(reply_length~complaint+
                                       
                                       (1|cityid)+(0+complaint|cityid), # City cluster
                                     d,
                                     nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE) ))


summary(reply_length_nb2 <- glmer.nb(reply_length~complaint+
                                       post_length_log+phone+topic + # Thread controls
                                       male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                       gdp_pc_log+pop_log+revenue_log+  # City econ control
                                       replied_days+ # Reply controls
                                       (1|cityid)+(0+complaint|cityid), # City cluster
                                     d,
                                     nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE) ))


tab_model(reply_length_poisson1, reply_length_poisson2,
          reply_length_nb1, reply_length_nb2,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D4 Reply length as DV poisson & nb model.html")





### Table D5: Strict complaint as IV #####

summary(complaint_strict1 <- glmer(concrete~complaint_strict+
                                     
                                     (1|cityid)+(0+complaint_strict|cityid), # City cluster
                                   d,
                                   nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial) )


summary(complaint_strict2 <- glmer(concrete~complaint_strict+
                                     post_length_log+phone+topic + # Thread controls
                                     
                                     (1|cityid)+(0+complaint_strict|cityid), # City cluster
                                   d,
                                   nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(complaint_strict3 <- glmer(concrete~complaint_strict+
                                     post_length_log+phone+topic+ # Thread controls
                                     male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                     
                                     (1|cityid)+(0+complaint_strict|cityid), # City cluster
                                   d,
                                   nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(complaint_strict4 <- glmer(concrete~complaint_strict+
                                     post_length_log+phone+topic+ # Thread controls
                                     male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                     gdp_pc_log+pop_log+revenue_log+  # City econ control
                                     
                                     (1|cityid)+(0+complaint_strict|cityid), # City cluster
                                   d,
                                   nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(complaint_strict5 <- glmer(concrete~complaint_strict+
                                     post_length_log+phone+topic+ # Thread controls
                                     male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                     gdp_pc_log+pop_log+revenue_log+  # City econ control
                                     replied_days+reply_length_log+ # Reply controls
                                     (1|cityid)+(0+complaint_strict|cityid), # City cluster
                                   d,
                                   nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


tab_model(complaint_strict1,complaint_strict2,complaint_strict3,complaint_strict4,complaint_strict5,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D5 Strict complaint.html")





## Testing regional heterogeneity ##########################

### Claim and complaint percentage
df= d %>% dplyr::group_by(prefcode, pref_ch)%>%
  dplyr::summarise(original_post_n = n(),
                   complaint_percent= mean(complaint, na.rm = T) )



#### Figure D1a: No outliers subset in terms of citizen claim number ####
boxplot(df[, "original_post_n"])
summary(df$original_post_n)
hist(df$original_post_n)
(Q1 <- quantile(df$original_post_n, .25))
(Q3 <- quantile(df$original_post_n, .75))
(IQR <- IQR(df$original_post_n)) # IQR(x) = quantile(x, 3/4) - quantile(x, 1/4).
IQR== Q3-Q1

df_outliers_claim = subset(df, df$original_post_n <= (Q1 - 1.5*IQR) | df$original_post_n >= (Q3 + 1.5*IQR)) # 29 cities are outliers
df_no_outliers_claim = subset(df, df$original_post_n > (Q1 - 1.5*IQR) & df$original_post_n < (Q3 + 1.5*IQR))


pdf("output/appendix/Figure D1a citizen_claim.pdf") # created a empty pdf
boxplot(df$original_post_n, df_no_outliers_claim$original_post_n,
        main = "Boxplot of Citizen Claim Number",
        xlab = "Left: full samples vs. Right: no outliers\nGroup by Cities",
        ylab = "Citizen claim number in cities") 
dev.off() 

## Data set for regression
d_no_outliers_claim = d[d$prefcode %in% df_no_outliers_claim$prefcode, ] # 135728
dim(d_no_outliers_claim)
length(unique(d$prefcode) ) # 330
length(unique(d_no_outliers_claim$prefcode)) # 301 cities left




#### Figure D1b: No outliers subset in terms of complaint percentage ####
boxplot(df[, "complaint_percent"])
summary(df$complaint_percent)
hist(df$complaint_percent)
(Q1 <- quantile(df$complaint_percent, .25))
(Q3 <- quantile(df$complaint_percent, .75))
(IQR <- IQR(df$complaint_percent)) # IQR(x) = quantile(x, 3/4) - quantile(x, 1/4).
IQR== Q3-Q1

df_outliers_complaint = subset(df, df$complaint_percent <= (Q1 - 1.5*IQR) | df$complaint_percent >= (Q3 + 1.5*IQR)) # 7 counties are outliers 
df_no_outliers_complaint = subset(df, df$complaint_percent > (Q1 - 1.5*IQR) & df$complaint_percent < (Q3 + 1.5*IQR))



pdf("output/appendix/Figure D1b complaint percent.pdf") # created a empty pdf
boxplot(df$complaint_percent, df_no_outliers_complaint$complaint_percent,
        main = "Boxplot of Complaint Percentage",
        xlab = "Left: full samples vs. Right: no outliers\nGroup by Cities",
        ylab = "Complaint percentage in cities") 
dev.off() 

## Data set for regression
d_no_outliers_complaint = d[d$prefcode %in% df_no_outliers_complaint$prefcode, ]  # 232192

length(unique(d$prefcode) ) # 330
length(unique(d_no_outliers_complaint$prefcode)) # 323 cities left



# no outliers in terms of citizen claim number

summary(d_no_outliers_claim1 <- glmer(concrete~complaint+ 
                                        
                                        (1|cityid)+(0+complaint|cityid), # City cluster
                                      d_no_outliers_claim,  # no outliers
                                      nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(d_no_outliers_claim2<- glmer(concrete~complaint+ 
                                       post_length_log+phone+topic + # Thread controls
                                       male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                       gdp_pc_log+pop_log+revenue_log+  # City econ control
                                       replied_days+reply_length_log+  # Reply controls
                                       
                                       (1|cityid)+(0+complaint|cityid), # City cluster
                                     d_no_outliers_claim,  # no outliers
                                     nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))



# no outliers in terms of complaint percentage
summary(d_no_outliers_complaint1 <- glmer(concrete~complaint+ 
                                            
                                            (1|cityid)+(0+complaint|cityid), # City cluster
                                          d_no_outliers_complaint,  # no outliers
                                          nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(d_no_outliers_complaint2<- glmer(concrete~complaint+
                                           post_length_log+phone+topic + # Thread controls
                                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                                           replied_days+reply_length_log+   # Reply controls
                                           (1|cityid)+(0+complaint|cityid), # City cluster
                                         d_no_outliers_complaint,  # no outliers
                                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


### Table D6 ####
## Save concreteness regression results
tab_model(d_no_outliers_claim1, d_no_outliers_claim2, d_no_outliers_complaint1,d_no_outliers_complaint2,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D6 Concreteness no outliers.html")





### Table D7: key cities and minority area ##########################

## Key cities
d_key_cities_remove = d |> subset(important_city==0)
d_key_cities_keep = d |> subset(important_city==1)

## Minority provinces
unique(d$province)
d_minority_prov_remove = d |> subset(!province %in% c("广西壮族自治区","宁夏回族自治区" ,"新疆维吾尔自治区" ,"内蒙古自治区","西藏自治区" ))
d_minority_prov_keep = d |> subset(province %in% c("广西壮族自治区","宁夏回族自治区" ,"新疆维吾尔自治区" ,"内蒙古自治区","西藏自治区" ))


## raw regression
summary(region1 <- glmer(concrete~complaint+ 
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_key_cities_remove,  # key cities removed
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(region2 <- glmer(concrete~complaint+ 
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_key_cities_keep,  # only key cities
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(region3 <- glmer(concrete~complaint+ 
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_minority_prov_remove,  # minority provinces removed
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(region4 <- glmer(concrete~complaint+ 
                           
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_minority_prov_keep,  # only minority provinces
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

## add controls

summary(region1b <- glmer(concrete~complaint+ 
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           replied_days+reply_length_log+   # Reply controls
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_key_cities_remove,  # key cities removed
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(region2b <- glmer(concrete~complaint+ 
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           replied_days+reply_length_log+   # Reply controls
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_key_cities_keep,  # only key cities
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

summary(region3b <- glmer(concrete~complaint+ 
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           replied_days+reply_length_log+   # Reply controls
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_minority_prov_remove,  # minority provinces removed
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


summary(region4b <- glmer(concrete~complaint+ 
                           post_length_log+phone+topic + # Thread controls
                           male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                           gdp_pc_log+pop_log+revenue_log+  # City econ control
                           replied_days+reply_length_log+   # Reply controls
                           (1|cityid)+(0+complaint|cityid), # City cluster
                         d_minority_prov_keep,  # only minority provinces
                         nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Save concreteness regression results
tab_model(region1,region1b,
          region2,region2b,
          region3,region3b, 
          region4,region4b,
          
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D7 Concreteness key cities and minorities.html")




## Unusual year issue ####


### Table D8 ####
range(d$reply_ym)
d_no_covid = d|>subset(reply_ym >= ym(202005))
d_2021 = d|>subset(reply_ym >= ym(202101))

#### Observations only between May 2020 and Dec 2021
summary(time1 <- glmer(concrete~complaint+ 
                         
                         (1|cityid)+(0+complaint|cityid), # City cluster
                       d_no_covid,  # no outliers
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

# control = glmerControl(optimizer = "nloptwrap", optCtrl = list(maxeval = 10000), calc.derivs = FALSE)
summary(time2 <- glmer(concrete~complaint+
                         post_length_log+phone+topic + # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ control
                         replied_days+reply_length_log+   # Reply controls
                         (1|cityid)+(0+complaint|cityid), # City cluster
                       d_no_covid,  # no outliers
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

#### Observations only in 2021
summary(time3 <- glmer(concrete~complaint+ 
                         (1|cityid)+(0+complaint|cityid), # City cluster
                       d_2021,  # no outliers
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))

# control = glmerControl(optimizer = "nloptwrap", optCtrl = list(maxeval = 10000), calc.derivs = FALSE)
summary(time4 <- glmer(concrete~complaint+
                         post_length_log+phone+topic + # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ control
                         replied_days+reply_length_log+   # Reply controls
                         (1|cityid)+(0+complaint|cityid), # City cluster
                       d_2021,  # no outliers
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))



## Save concreteness regression results
tab_model(time1,time2,time3,time4,
          transform=NULL, # not to show odds ratio
          show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
          digits=2, digits.p=2,
          show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
          file= "output/appendix/Table D8 Timing test.html")



## Request for high-quality replies ####
### Figures D2 to D5 
### Tables D9 to D10
# See the end of this appendix code file. 






#_____________####
#--------------------------------------------------------------------------------#
# Online Appendix E: Reasons of Prioritizing Complaints -------------------------------------------------------
#--------------------------------------------------------------------------------#

## Table E1: Each factor itself as IV ####

## Negative information collection
summary(d$normalized_negative_score)
summary(why1 <- glmer(concrete~ normalized_negative_score+ 
                        
                        (1|cityid)+(0+normalized_negative_score|cityid), # City cluster
                      d,
                      nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Performing citizenship
summary(d$subject_citizenship) # 0.2231
summary(d$legal_citizenship) # 0.1872 # similar to their 0.14 (performing citizenship article)
summary(d$socialist_citizenship) # 0.3204


# add subjecthood_citizenship
summary(why2 <- glmer(concrete~ subject_citizenship+
                        
                        (1|cityid)+(0+subject_citizenship|cityid), # City cluster
                      d,
                      nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Risk aversion

### claims containing protest keywords
table(d$protest_risk)
summary(why3 <- glmer(concrete~ protest_risk+ # @
                        
                        (1|cityid)+(0+protest_risk|cityid), # City cluster
                      d,
                      nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


### From 12345 hotline
table(d$from_hotline12345) # 0: 225851   1: 7153 
summary(why4 <- glmer(concrete~ from_hotline12345+ 
                        
                        (1|cityid)+(0+from_hotline12345|cityid), # City cluster
                      d,
                      nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Add Controls
## Negative information collection
summary(d$normalized_negative_score)
summary(why1b <- glmer(concrete~ normalized_negative_score+ 
                         post_length_log+phone+topic+ # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ controls
                         replied_days+reply_length_log+ # Reply controls
                         (1|cityid)+(0+normalized_negative_score|cityid), # City cluster
                       d,
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Performing citizenship
summary(d$subject_citizenship) # 0.2231
summary(d$legal_citizenship) # 0.1872 # similar to their 0.14 (performing citizenship article)
summary(d$socialist_citizenship) # 0.3204


# Add subjecthood_citizenship
summary(why2b <- glmer(concrete~ subject_citizenship+
                         post_length_log+phone+topic+ # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ controls
                         replied_days+reply_length_log+ # Reply controls
                         (1|cityid)+(0+subject_citizenship|cityid), # City cluster
                       d,
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


## Risk aversion

### claims containing protest keywords
table(d$protest_risk)
summary(why3b <- glmer(concrete~ protest_risk+ # @
                         post_length_log+phone+topic+ # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ controls
                         replied_days+reply_length_log+ # Reply controls
                         (1|cityid)+(0+protest_risk|cityid), # City cluster
                       d,
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


### From 12345 hotline
table(d$from_hotline12345)
summary(why4b <- glmer(concrete~ from_hotline12345+ # @
                         post_length_log+phone+topic+ # Thread controls
                         male+age+han_race+edu_shuangyiliu_bin+edu_phd+tenure_length_m+ # Pref leader controls
                         gdp_pc_log+pop_log+revenue_log+  # City econ controls
                         replied_days+reply_length_log+ # Reply controls
                         (1|cityid)+(0+from_hotline12345|cityid), # City cluster
                       d,
                       nAGQ = 0, control = glmerControl(optimizer = "nloptwrap", calc.derivs = FALSE),family=binomial))


keep1 = c(
  "normalized_negative_score",
  "subject_citizenship","legal_citizenship","socialist_citizenship",
  "protest_risk","from_hotline12345"
)
sjPlot::tab_model(why1,why1b,
                  why2,why2b,
                  why3,why3b,
                  why4,why4b,
                  transform=NULL, # not to show odds ratio
                  show.ci=F, show.se=T, collapse.se = T, p.style="stars", # one column (p: numeric, stars (asterisk), scientific, numeric_stars)
                  digits=2, digits.p=2,
                  terms= keep1, # Only present the interested predictors
                  show.r2= F, show.aic=T,show.loglik=F,show.dev=F,show.obs=T,
                  file= "output/appendix/Table E1 Each element itself.html") 




## Figure E1: Correlation between complaint and each factor ####

source("data/stat_binscatter.R")
library(miceadds)

## Complaint and negative sentiment ####
cor_value <- cor(d$complaint, d$normalized_negative_score, method = "pearson")

summary(d$normalized_negative_score)
summary(d$post_negative_n)
table(d$complaint)
summary(f1<-lm.cluster( complaint  ~  normalized_negative_score

               ,cluster="cityid",data=d))


txt1<-expression(beta ==  0.82~"***")

p1=ggplot(d, aes(y=normalized_negative_score,x=complaint_f )) +
  stat_binscatter(bins = 1000,geom="pointrange")+
  annotate(geom="text",x=1,y=1,label=as.character(txt1), size=6, parse=T)+
  xlab("Complaint")+ylab("Negative sentiment score")+
  theme_classic()


## Complaint and subjecthood citizenship ####
cor_value <- cor(d$complaint, d$subject_citizenship, method = "pearson")

summary(d$subject_citizenship)
table(d$complaint)

summary(f1<-glm.cluster(subject_citizenship ~ complaint

               ,cluster="cityid",family="gaussian", data=d) )

txt1<-expression(beta ==  -0.07~"***")

p2=ggplot(d, aes(y=subject_citizenship,x=complaint_f )) +
  stat_binscatter(bins = 1000,geom="pointrange")+
  annotate(geom="text",x=1,y=1,label=as.character(txt1), size=6, parse=T)+
  xlab("Complaint")+ylab("Subjecthood citizenship")+
  theme_classic()




## Complaint and from_hotline12345 ####
summary(d$from_hotline12345)
table(d$complaint)
f1<-lm.cluster(from_hotline12345 ~  complaint

               ,cluster="cityid",data=d)

summary(f1)
txt1<-expression(beta ==  0.03~"***")

p3=ggplot(d, aes(y=from_hotline12345,x=complaint_f )) +
  stat_binscatter(bins = 1000,geom="pointrange")+
  annotate(geom="text",x=1,y=1,label=as.character(txt1), size=6, parse=T)+
  xlab("Complaint")+ylab("From 12345 hotline")+
  theme_classic()


## Complaint and protest_risk ####
summary(d$protest_risk)
table(d$complaint)
f1<-lm.cluster(protest_risk ~  complaint

               ,cluster="cityid",data=d)

summary(f1)
txt1<-expression(beta ==  0.07~"***")

p4=ggplot(d, aes(y=protest_risk,x=complaint_f )) +
  stat_binscatter(bins = 1000,geom="pointrange")+
  annotate(geom="text",x=1,y=1,label=as.character(txt1), size=6, parse=T)+
  xlab("Complaint")+ylab("Protest Risk")+
  theme_classic()



ggpubr::ggarrange(p1,p2,p3,p4,
                  labels = c("A", "B", "C", "D"),
                  ncol = 2, nrow = 2) 
ggsave(file="output/appendix/Figure E1 Correlations.pdf",width=12,height=10)






#_____________####
#--------------------------------------------------------------------------------#
# Online Appendix F: Evidence from Wuhan ####
#--------------------------------------------------------------------------------#
# See "3.Extension_Wuhan.R"














#_____________####
#--------------------------------------------------------------------------------#
# Online Appendix D part 2: Request for high-quality replies ####
#--------------------------------------------------------------------------------#
# Step 1: Randomly select 1000 cases. 
# Step 2: Two persons' independently manually label request_for_concreteness (whether a citizen request for concrete response). 
# Step 3: Check the quality of the two human labels. 
# Step 4: Analyse if complaints are more correlated with request_for_concreteness, which in  turn shapes the quality of government responsiveness


rm(list=ls())
#Set directory#
(code_file_path = rstudioapi::getActiveDocumentContext()$path) # Get current path of this code file
setwd(dirname(code_file_path)) # dirname(): reach the upper level folder
getwd() # current directory

#Set packages#
library(ggplot2)
library(tidyverse)
require(quanteda)
require(quanteda.textmodels)
require(caret)
require(extrafont) #version 0.18
loadfonts() # load font
# Solve the problem of conflict packages
library(conflicted)
conflict_prefer("select", "dplyr") # To ensure that when using select, it refers to dplyr

## Step 1: Randomly sampling 1000 observations from the database ----------------------------------------------------------------------------------------------------------------

# Import the data
load(file="data/llmb_multilevel_data_final.Rdata")
data <- llmb_multilevel_data_final
data$row_id = 1:nrow(data)

# Subset rows
set.seed(1) # Ensure the replication of sample()
index=sample(1:nrow(data),1000,replace=F)
head(index); tail(index)

sample_1000=data[index,]
dim(sample_1000) # 1000   154
save(sample_1000,file="data/random_sampled/sample_1000.Rdata")


## Check the quality of random sample -------------------------------------------
load(file="data/random_sampled/sample_1000.Rdata")
df = sample_1000

load(file="data/llmb_multilevel_data_final.Rdata")
data <- llmb_multilevel_data_final

### Histgraph plot of DV and IV ####
table(df$complaint)
# Histogram for 'complaint' in df
p_complaint_df <- ggplot(df, aes(x = complaint)) +
  geom_histogram(binwidth = 1, color = "white", fill = "#3a554c", aes(y = ..count..)) +  # Ensure y aesthetic is set to count for labeling
  geom_text(stat = "bin", aes(label = ..count.., y = ..count.. + 0.5), vjust = -0.1, binwidth = 1) +  # Adjust position with vjust
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 1), labels = c("0", "1")) +
  labs(title = "Histogram of Complaints in 1000 Cases", x = "Complaint or not", y = "Frequency")

# Histogram for 'complaint' in data
table(data$complaint)
p_complaint_data <- ggplot(data, aes(x = complaint)) +
  geom_histogram(binwidth = 1, color = "white", fill = "#e86655", aes(y = ..count..)) +  # Ensure y aesthetic is set to count for labeling
  geom_text(stat = "bin", aes(label = ..count.., y = ..count.. + 0.5), vjust = -0.1, binwidth = 1) +  # Adjust position with vjust
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 1), labels = c("0", "1")) +
  labs(title = "Histogram of Complaints in Full Data", x = "Complaint or not", y = "Frequency")


# Histogram for 'concrete' in df
table(df$concrete)
p_concrete_df <- ggplot(df, aes(x = concrete)) +
  geom_histogram(binwidth = 1, color = "white", fill = "#f8b781", aes(y = ..count..)) +  # Ensure y aesthetic is set to count for labeling
  geom_text(stat = "bin", aes(label = ..count.., y = ..count.. + 0.5), vjust = -0.1, binwidth = 1) +  # Adjust position with vjust
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 1), labels = c("0", "1")) +
  labs(title = "Histogram of Concrete Responses in 1000 Cases", x = "Concrete reply or not", y = "Frequency")


# Histogram for 'concrete' in data
table(data$concrete)
p_concrete_data <- ggplot(data, aes(x = concrete)) +
  geom_histogram(binwidth = 1, color = "white", fill = "#1c4354", aes(y = ..count..)) +  # Ensure y aesthetic is set to count for labeling
  geom_text(stat = "bin", aes(label = ..count.., y = ..count.. + 0.5), vjust = -0.1, binwidth = 1) +  # Adjust position with vjust
  theme_minimal() +
  scale_x_continuous(breaks = c(0, 1), labels = c("0", "1")) +
  labs(title = "Histogram of Concrete Responses in Full Data", x = "Concrete reply or not", y = "Frequency")

#### Figure D2 ####
ggpubr::ggarrange(p_complaint_df,p_complaint_data,p_concrete_df,p_concrete_data,
                  labels = c("A1", "A2", "B1", "B2"),
                  font.label = list(size = 12, color = "#262a2b", face = "bold", family = NULL),
                  ncol = 2, nrow = 2)

ggsave(file="output/appendix/Figure D2 IV and DV.pdf",width=10,height=8) 




### Monthly post and reply number ####
# monthly posts of sampled 1000 cases
post_daily = df %>% # sampled 1000 cases
  subset(post_ymd>=ymd(20200101)) |>
  dplyr::group_by(post_ym) %>%
  dplyr::summarise(day_post_n=n())


s1=post_daily |>
  ggplot(aes(x=post_ym, y=day_post_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Number of posts")+ggtitle("1000 Cases")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  #theme(axis.text.x = element_text(angle = 90))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))

# monthly post of full data
s2=data %>%  # data: full data 
  subset(post_ymd>=ymd(20200101)) |>
  dplyr::group_by(post_ym) %>%
  dplyr::summarise(day_post_n=n()) |>
  ggplot(aes(x=post_ym, y=day_post_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Number of posts")+ggtitle("Full Data")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))



# monthly replies of sampled 1000 cases
s3=df %>% # df: sampled 1000 cases
  dplyr::group_by(reply_ym) %>%
  dplyr::summarise(day_reply_n=n()) |> # daily reply
  ggplot(aes(x=reply_ym, y=day_reply_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Number of replies")+ggtitle("1000 Cases")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  #theme(axis.text.x = element_text(angle = 90))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))

# monthly replies of full data 
s4=data %>%  # data: full data 
  dplyr::group_by(reply_ym) %>%
  dplyr::summarise(day_reply_n=n()) |>
  ggplot(aes(x=reply_ym, y=day_reply_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Number of replies")+ggtitle("Full Data")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  #theme(axis.text.x = element_text(angle = 90))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))

#### Figure D3 ####
ggpubr::ggarrange(s1,s2,s3,s4,
                  labels = c("A1", "A2", "B1", "B2"),
                  font.label = list(size = 12, color = "blue", face = "bold", family = NULL),
                  ncol = 2, nrow = 2)


ggsave(file="output/appendix/Figure D3 The number of post and reply, sample vs. full sample.pdf",width=15.5,height=9) 



### Monthly number of complaints ####

# monthly complaint ratio of sampled 1000 cases
temp=df %>% # df: sampled 1000 cases
  dplyr::group_by(post_ym) %>%
  dplyr::summarise(complaint_n=sum(complaint,na.rm = T) )

s5 = temp |> 
  ggplot(aes(x=post_ym, y=complaint_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Complaint Number")+ggtitle("1000 Cases")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  #theme(axis.text.x = element_text(angle = 90))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))



temp=data %>% # data: full data
  dplyr::group_by(post_ym) %>%
  dplyr::summarise(complaint_n=sum(complaint,na.rm = T) )

s6 = temp |> 
  ggplot(aes(x=post_ym, y=complaint_n)) + 
  geom_line() + 
  theme_bw(base_size=16)+
  xlab("From Jan. 2020 to Dec. 2021")+ylab("Complaint Number")+ggtitle("Full Data")+
  geom_line(size = 0.5) + scale_x_date(date_labels = "%b", date_breaks = "3 month")+
  #theme(axis.text.x = element_text(angle = 90))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(text = element_text(size=16, colour = "black"),
        axis.title.x = element_text(size=16, colour = "black"), 
        axis.title.y = element_text(size=16, colour = "black"), 
        axis.text.x  = element_text(size=16, colour = "black"), 
        axis.text.y = element_text(size=16, colour = "black"))

#### Figure D4 ####
ggpubr::ggarrange(s5,s6,
                  labels = c("C1", "C2"),
                  font.label = list(size = 12, color = "blue", face = "bold"),
                  ncol = 2, nrow = 1)

ggsave("output/appendix/Figure D4 The number of post and reply, sample vs. full sample.pdf",width=15.5,height=4) 





## Select columns for human labeling--------------------------------------------
load(file="data/random_sampled/sample_1000.Rdata")
length(unique(sample_1000$tid))

llmb_1000_for_label <- sample_1000 |>
  select(
    # id
    row_id,tid,
    # post,
    post_ymd, post_ym, original_post, 
    # reply 
    reply_ymd, reply_ym, reply_content
  )

glimpse(llmb_1000_for_label)
write.csv(llmb_1000_for_label,"data/random_sampled/llmb_1000_for_label.csv",row.names=F)


## Step 2: Label ------------------------------------------------------------------------
# Manually coding request_for_concreteness (whether a citizen request for concrete response)
# For these observations the two coders gave different coding results, the two coders discussed it to reach a agreed coding. 
df1 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding1.xlsx")
df2 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding2.xlsx")


# For those the two coders gave a different coding:
dt = data.frame(tid = df1$tid, original_post = df1$original_post,
                request_for_concreteness1=df1$is_request_for_concreteness, 
                request_for_concreteness2=df2$is_request_for_concreteness)

dt = dt |> mutate(
  debatable = ifelse(request_for_concreteness1 != request_for_concreteness2, 1, 0),
  discussed = NA
)

table(dt$debatable)

llmb_1000_coding_combined = dt 
openxlsx::write.xlsx(llmb_1000_coding_combined, "data/random_sampled/llmb_1000_coding_combined.xlsx", na="")
# After discussing, the two coders got a new excel named "llmb_1000_coding_combined_discussed.xlsx"

## Step 3: Check the quality of labeling ------------------------------------------------------------------------
df1 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding1.xlsx")
df2 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding2.xlsx")
names(df1)
table(df1$tid == df2$tid) # all tid are the same between the two excels

# Cohen's Kappa
library(psych)  # This package contains the kappa2 function
contingency_table <- table(df1$is_request_for_concreteness, df2$is_request_for_concreteness)
contingency_table

# Calculate Cohen's Kappa
kappa_result <- cohen.kappa(contingency_table)
kappa_result


## Step 4: Results ------------------------------------------------------------------------
load(file="data/random_sampled/sample_1000.Rdata")

df1 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding1.xlsx")
df2 <- readxl::read_excel("data/random_sampled/llmb_1000_for_label v1_coding2.xlsx")
df3 <-  readxl::read_excel("data/random_sampled/llmb_1000_coding_combined_discussed.xlsx")


table(df1$tid == df3$tid);table(df2$tid == df3$tid)
glimpse(df3)

# If missing, the coding should equal to one of the two coders' coding (the two are the same)
dt = df3 |> 
  mutate(request_for_concreteness = ifelse(is.na(discussed), request_for_concreteness1, discussed) ) 

# Select useful variables 
dt = dt |> 
  select(tid, request_for_concreteness1,request_for_concreteness2,request_for_concreteness)

data = left_join(sample_1000, dt, by="tid")




##### Figure D5 ####

df = data |> 
  select(complaint, 
         request_for_concreteness1, request_for_concreteness2, request_for_concreteness, 
         concrete)

df <- df %>%
  mutate(complaint = factor(complaint, levels = c(0, 1), labels = c("Non-Complaint", "Complaint")),
         request_for_concreteness1 = factor(request_for_concreteness1, levels = c(0, 1), labels = c("No", "Yes")),
         request_for_concreteness2 = factor(request_for_concreteness2, levels = c(0, 1), labels = c("No", "Yes")),
         request_for_concreteness = factor(request_for_concreteness, levels = c(0, 1), labels = c("No", "Yes") ) )
head(df)

table(df$request_for_concreteness1) # 815 
table(df$request_for_concreteness2) # 823
table(df$request_for_concreteness) # 833

# Bar plot for request_for_concreteness1
p1=ggplot(df, aes(x = complaint, fill = request_for_concreteness1)) +
  geom_bar(position = "fill") +
  labs(title = "Coding 1",
       x = "Complaint Status",
       y = "Proportion",
       fill = "Request for Concreteness\n(label 1)") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

# Bar plot for request_for_concreteness2
p2=ggplot(df, aes(x = complaint, fill = request_for_concreteness2)) +
  geom_bar(position = "fill") +
  labs(title = "Coding 2",
       x = "Complaint Status",
       y = "Proportion",
       fill = "Request for Concreteness\n(label 2)") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

p3=ggplot(df, aes(x = complaint, fill = request_for_concreteness)) +
  geom_bar(position = "fill") +
  labs(title = "Integrated Coding",
       x = "Complaint Status",
       y = "Proportion",
       fill = "Request for Concreteness\n(discussed)") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

ggpubr::ggarrange(p1,p2,p3,
                  labels = c("A", "B", "C"),
                  font.label = list(size = 12, color = "#262a2b", face = "bold"),
                  nrow = 1, ncol = 3)

ggsave("output/appendix/Figure D5 Complaint and request for concreteness.pdf",width=15,height=4) 





### Table D9: Complaint and request for concreteness ####
# Linear regression to check if complaints are more likely to request concrete replies (as labeled by RA1)
data = data |> mutate(
  request_for_concreteness1 = request_for_concreteness1 |> as.numeric(),
  request_for_concreteness2 = request_for_concreteness2 |> as.numeric(),
  request_for_concreteness = request_for_concreteness |> as.numeric()
)


summary(logistic1 <- glm(request_for_concreteness ~ complaint, data = data, family = binomial))


stargazer::stargazer(logistic1,
                     type = "text",
                     out = "output/appendix/Table D9 Complaint and request for concrete replies.html",
                     dep.var.labels.include = FALSE, dep.var.caption = "DV: Request for Concrete Reply (1=yes, 0=no)",
                     digits = 3,
                     star.cutoffs = c(0.1, 0.05, 0.01, 0.001),  # Set cutoffs for significance
                     star.char = c("+", "*", "**", "***"),  # Set characters for significance levels
                     
                     # Keep and label the independent variables and add checkmarks
                     keep = c("complaint"),
                     covariate.labels = c("Complaint (1=yes, 0=no)"),
                     keep.stat	= c("n"),
                     # notes
                     notes = "+ p<0.1, * p<0.05, ** p<0.01, *** p<0.001 (two-tailed test)",  # Custom note
                     notes.align = "l",
                     notes.append = FALSE  # Make sure to not append default notes
)



### Table D10: Accounting for request feature ####
# Logistic regression: to check if complaints are more likely to request concrete replies
summary(logistic1 <- glm(concrete ~ complaint, data = data, family = binomial))
summary(logistic2 <- glm(concrete ~ request_for_concreteness, data = data, family = binomial))
summary(logistic3 <- glm(concrete ~ complaint + request_for_concreteness, data = data, family = binomial))


stargazer::stargazer(logistic1, logistic2,logistic3,
                     
                     type = "text",
                     out = "output/appendix/Table D10 Controling request for concrete replies.html",
                     dep.var.labels.include = FALSE, dep.var.caption = "DV: Concrete Reply (1=yes, 0=no)",
                     digits = 3,
                     star.cutoffs = c(0.1, 0.05, 0.01, 0.001),  # Set cutoffs for significance
                     star.char = c("+", "*", "**", "***"),  # Set characters for significance levels
                     
                     # Keep and label the independent variables and add checkmarks
                     keep = c("complaint", "request_for_concreteness"),
                     covariate.labels = c("Complaint (1=yes, 0=no)", "Request for Concrete Reply (1=yes, 0=no)"),
                     keep.stat	= c("n"),
                     # notes
                     notes = "+ p<0.1, * p<0.05, ** p<0.01, *** p<0.001 (two-tailed test)",  # Custom note
                     notes.align = "l",
                     notes.append = FALSE  # Make sure to not append default notes
)





cat("The end at ", format(Sys.time(), "%d %b %Y %a %X"))


















